{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction to DataFrames\n",
    "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2017**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "using DataFrames # load package"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Joining DataFrames"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preparing DataFrames for a join"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5×2 DataFrames.DataFrame\n",
       "│ Row │ ID      │ name  │\n",
       "├─────┼─────────┼───────┤\n",
       "│ 1   │ 1       │ Alice │\n",
       "│ 2   │ 2       │ Bob   │\n",
       "│ 3   │ 3       │ Conor │\n",
       "│ 4   │ 4       │ Dave  │\n",
       "│ 5   │ \u001b[90mmissing\u001b[39m │ Zed   │, 5×2 DataFrames.DataFrame\n",
       "│ Row │ id      │ age │\n",
       "├─────┼─────────┼─────┤\n",
       "│ 1   │ 1       │ 21  │\n",
       "│ 2   │ 2       │ 22  │\n",
       "│ 3   │ 5       │ 23  │\n",
       "│ 4   │ 6       │ 24  │\n",
       "│ 5   │ \u001b[90mmissing\u001b[39m │ 99  │)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = DataFrame(ID=[1,2,3,4,missing], name = [\"Alice\", \"Bob\", \"Conor\", \"Dave\",\"Zed\"])\n",
    "y = DataFrame(id=[1,2,5,6,missing], age = [21,22,23,24,99])\n",
    "x,y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>name</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>Alice</td></tr><tr><th>2</th><td>2</td><td>Bob</td></tr><tr><th>3</th><td>3</td><td>Conor</td></tr><tr><th>4</th><td>4</td><td>Dave</td></tr><tr><th>5</th><td>missing</td><td>Zed</td></tr></tbody></table>"
      ],
      "text/plain": [
       "5×2 DataFrames.DataFrame\n",
       "│ Row │ id      │ name  │\n",
       "├─────┼─────────┼───────┤\n",
       "│ 1   │ 1       │ Alice │\n",
       "│ 2   │ 2       │ Bob   │\n",
       "│ 3   │ 3       │ Conor │\n",
       "│ 4   │ 4       │ Dave  │\n",
       "│ 5   │ \u001b[90mmissing\u001b[39m │ Zed   │"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rename!(x, :ID=>:id) # names of columns on which we want to join must be the same"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Standard joins: inner, left, right, outer, semi, anti"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>name</th><th>age</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>Alice</td><td>21</td></tr><tr><th>2</th><td>2</td><td>Bob</td><td>22</td></tr><tr><th>3</th><td>missing</td><td>Zed</td><td>99</td></tr></tbody></table>"
      ],
      "text/plain": [
       "3×3 DataFrames.DataFrame\n",
       "│ Row │ id      │ name  │ age │\n",
       "├─────┼─────────┼───────┼─────┤\n",
       "│ 1   │ 1       │ Alice │ 21  │\n",
       "│ 2   │ 2       │ Bob   │ 22  │\n",
       "│ 3   │ \u001b[90mmissing\u001b[39m │ Zed   │ 99  │"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "join(x, y, on=:id) # :inner join by default, missing is joined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>name</th><th>age</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>Alice</td><td>21</td></tr><tr><th>2</th><td>2</td><td>Bob</td><td>22</td></tr><tr><th>3</th><td>3</td><td>Conor</td><td>missing</td></tr><tr><th>4</th><td>4</td><td>Dave</td><td>missing</td></tr><tr><th>5</th><td>missing</td><td>Zed</td><td>99</td></tr></tbody></table>"
      ],
      "text/plain": [
       "5×3 DataFrames.DataFrame\n",
       "│ Row │ id      │ name  │ age     │\n",
       "├─────┼─────────┼───────┼─────────┤\n",
       "│ 1   │ 1       │ Alice │ 21      │\n",
       "│ 2   │ 2       │ Bob   │ 22      │\n",
       "│ 3   │ 3       │ Conor │ \u001b[90mmissing\u001b[39m │\n",
       "│ 4   │ 4       │ Dave  │ \u001b[90mmissing\u001b[39m │\n",
       "│ 5   │ \u001b[90mmissing\u001b[39m │ Zed   │ 99      │"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "join(x, y, on=:id, kind=:left)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>name</th><th>age</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>Alice</td><td>21</td></tr><tr><th>2</th><td>2</td><td>Bob</td><td>22</td></tr><tr><th>3</th><td>missing</td><td>Zed</td><td>99</td></tr><tr><th>4</th><td>5</td><td>missing</td><td>23</td></tr><tr><th>5</th><td>6</td><td>missing</td><td>24</td></tr></tbody></table>"
      ],
      "text/plain": [
       "5×3 DataFrames.DataFrame\n",
       "│ Row │ id      │ name    │ age │\n",
       "├─────┼─────────┼─────────┼─────┤\n",
       "│ 1   │ 1       │ Alice   │ 21  │\n",
       "│ 2   │ 2       │ Bob     │ 22  │\n",
       "│ 3   │ \u001b[90mmissing\u001b[39m │ Zed     │ 99  │\n",
       "│ 4   │ 5       │ \u001b[90mmissing\u001b[39m │ 23  │\n",
       "│ 5   │ 6       │ \u001b[90mmissing\u001b[39m │ 24  │"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "join(x, y, on=:id, kind=:right)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>name</th><th>age</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>Alice</td><td>21</td></tr><tr><th>2</th><td>2</td><td>Bob</td><td>22</td></tr><tr><th>3</th><td>3</td><td>Conor</td><td>missing</td></tr><tr><th>4</th><td>4</td><td>Dave</td><td>missing</td></tr><tr><th>5</th><td>missing</td><td>Zed</td><td>99</td></tr><tr><th>6</th><td>5</td><td>missing</td><td>23</td></tr><tr><th>7</th><td>6</td><td>missing</td><td>24</td></tr></tbody></table>"
      ],
      "text/plain": [
       "7×3 DataFrames.DataFrame\n",
       "│ Row │ id      │ name    │ age     │\n",
       "├─────┼─────────┼─────────┼─────────┤\n",
       "│ 1   │ 1       │ Alice   │ 21      │\n",
       "│ 2   │ 2       │ Bob     │ 22      │\n",
       "│ 3   │ 3       │ Conor   │ \u001b[90mmissing\u001b[39m │\n",
       "│ 4   │ 4       │ Dave    │ \u001b[90mmissing\u001b[39m │\n",
       "│ 5   │ \u001b[90mmissing\u001b[39m │ Zed     │ 99      │\n",
       "│ 6   │ 5       │ \u001b[90mmissing\u001b[39m │ 23      │\n",
       "│ 7   │ 6       │ \u001b[90mmissing\u001b[39m │ 24      │"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "join(x, y, on=:id, kind=:outer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>name</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>Alice</td></tr><tr><th>2</th><td>2</td><td>Bob</td></tr><tr><th>3</th><td>missing</td><td>Zed</td></tr></tbody></table>"
      ],
      "text/plain": [
       "3×2 DataFrames.DataFrame\n",
       "│ Row │ id      │ name  │\n",
       "├─────┼─────────┼───────┤\n",
       "│ 1   │ 1       │ Alice │\n",
       "│ 2   │ 2       │ Bob   │\n",
       "│ 3   │ \u001b[90mmissing\u001b[39m │ Zed   │"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "join(x, y, on=:id, kind=:semi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>name</th></tr></thead><tbody><tr><th>1</th><td>3</td><td>Conor</td></tr><tr><th>2</th><td>4</td><td>Dave</td></tr></tbody></table>"
      ],
      "text/plain": [
       "2×2 DataFrames.DataFrame\n",
       "│ Row │ id │ name  │\n",
       "├─────┼────┼───────┤\n",
       "│ 1   │ 3  │ Conor │\n",
       "│ 2   │ 4  │ Dave  │"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "join(x, y, on=:id, kind=:anti)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cross join"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>a</th><th>b</th><th>c</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>a</td><td>true</td></tr><tr><th>2</th><td>1</td><td>a</td><td>false</td></tr><tr><th>3</th><td>1</td><td>b</td><td>true</td></tr><tr><th>4</th><td>1</td><td>b</td><td>false</td></tr><tr><th>5</th><td>1</td><td>c</td><td>true</td></tr><tr><th>6</th><td>1</td><td>c</td><td>false</td></tr><tr><th>7</th><td>2</td><td>a</td><td>true</td></tr><tr><th>8</th><td>2</td><td>a</td><td>false</td></tr><tr><th>9</th><td>2</td><td>b</td><td>true</td></tr><tr><th>10</th><td>2</td><td>b</td><td>false</td></tr><tr><th>11</th><td>2</td><td>c</td><td>true</td></tr><tr><th>12</th><td>2</td><td>c</td><td>false</td></tr></tbody></table>"
      ],
      "text/plain": [
       "12×3 DataFrames.DataFrame\n",
       "│ Row │ a │ b │ c     │\n",
       "├─────┼───┼───┼───────┤\n",
       "│ 1   │ 1 │ a │ true  │\n",
       "│ 2   │ 1 │ a │ false │\n",
       "│ 3   │ 1 │ b │ true  │\n",
       "│ 4   │ 1 │ b │ false │\n",
       "│ 5   │ 1 │ c │ true  │\n",
       "│ 6   │ 1 │ c │ false │\n",
       "│ 7   │ 2 │ a │ true  │\n",
       "│ 8   │ 2 │ a │ false │\n",
       "│ 9   │ 2 │ b │ true  │\n",
       "│ 10  │ 2 │ b │ false │\n",
       "│ 11  │ 2 │ c │ true  │\n",
       "│ 12  │ 2 │ c │ false │"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# cross-join does not require on argument\n",
    "# it produces a Cartesian product or arguments\n",
    "function expand_grid(;xs...) # a simple replacement for expand.grid in R\n",
    "    reduce((x,y) -> join(x, DataFrame(Pair(y...)), kind=:cross),\n",
    "           DataFrame(Pair(xs[1]...)), xs[2:end])\n",
    "end\n",
    "\n",
    "expand_grid(a=[1,2], b=[\"a\",\"b\",\"c\"], c=[true,false])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Complex cases of joins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6×3 DataFrames.DataFrame\n",
       "│ Row │ id1     │ id2     │ name  │\n",
       "├─────┼─────────┼─────────┼───────┤\n",
       "│ 1   │ 1       │ 1       │ Alice │\n",
       "│ 2   │ 1       │ 11      │ Bob   │\n",
       "│ 3   │ 2       │ 2       │ Conor │\n",
       "│ 4   │ 2       │ 21      │ Dave  │\n",
       "│ 5   │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ Zed   │\n",
       "│ 6   │ \u001b[90mmissing\u001b[39m │ 99      │ Zoe   │, 6×3 DataFrames.DataFrame\n",
       "│ Row │ id1     │ id2     │ age │\n",
       "├─────┼─────────┼─────────┼─────┤\n",
       "│ 1   │ 1       │ 11      │ 21  │\n",
       "│ 2   │ 1       │ 1       │ 22  │\n",
       "│ 3   │ 3       │ 31      │ 23  │\n",
       "│ 4   │ 3       │ 3       │ 24  │\n",
       "│ 5   │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ 99  │\n",
       "│ 6   │ \u001b[90mmissing\u001b[39m │ 999     │ 100 │)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = DataFrame(id1=[1,1,2,2,missing,missing],\n",
    "              id2=[1,11,2,21,missing,99],\n",
    "              name = [\"Alice\", \"Bob\", \"Conor\", \"Dave\",\"Zed\", \"Zoe\"])\n",
    "y = DataFrame(id1=[1,1,3,3,missing,missing],\n",
    "              id2=[11,1,31,3,missing,999],\n",
    "              age = [21,22,23,24,99, 100])\n",
    "x,y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id1</th><th>id2</th><th>name</th><th>age</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>1</td><td>Alice</td><td>22</td></tr><tr><th>2</th><td>1</td><td>11</td><td>Bob</td><td>21</td></tr><tr><th>3</th><td>missing</td><td>missing</td><td>Zed</td><td>99</td></tr></tbody></table>"
      ],
      "text/plain": [
       "3×4 DataFrames.DataFrame\n",
       "│ Row │ id1     │ id2     │ name  │ age │\n",
       "├─────┼─────────┼─────────┼───────┼─────┤\n",
       "│ 1   │ 1       │ 1       │ Alice │ 22  │\n",
       "│ 2   │ 1       │ 11      │ Bob   │ 21  │\n",
       "│ 3   │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ Zed   │ 99  │"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "join(x, y, on=[:id1, :id2]) # joining on two columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id1</th><th>id2</th><th>name</th><th>id2_1</th><th>age</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>1</td><td>Alice</td><td>11</td><td>21</td></tr><tr><th>2</th><td>1</td><td>1</td><td>Alice</td><td>1</td><td>22</td></tr><tr><th>3</th><td>1</td><td>11</td><td>Bob</td><td>11</td><td>21</td></tr><tr><th>4</th><td>1</td><td>11</td><td>Bob</td><td>1</td><td>22</td></tr><tr><th>5</th><td>missing</td><td>missing</td><td>Zed</td><td>missing</td><td>99</td></tr><tr><th>6</th><td>missing</td><td>missing</td><td>Zed</td><td>999</td><td>100</td></tr><tr><th>7</th><td>missing</td><td>99</td><td>Zoe</td><td>missing</td><td>99</td></tr><tr><th>8</th><td>missing</td><td>99</td><td>Zoe</td><td>999</td><td>100</td></tr></tbody></table>"
      ],
      "text/plain": [
       "8×5 DataFrames.DataFrame\n",
       "│ Row │ id1     │ id2     │ name  │ id2_1   │ age │\n",
       "├─────┼─────────┼─────────┼───────┼─────────┼─────┤\n",
       "│ 1   │ 1       │ 1       │ Alice │ 11      │ 21  │\n",
       "│ 2   │ 1       │ 1       │ Alice │ 1       │ 22  │\n",
       "│ 3   │ 1       │ 11      │ Bob   │ 11      │ 21  │\n",
       "│ 4   │ 1       │ 11      │ Bob   │ 1       │ 22  │\n",
       "│ 5   │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ Zed   │ \u001b[90mmissing\u001b[39m │ 99  │\n",
       "│ 6   │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ Zed   │ 999     │ 100 │\n",
       "│ 7   │ \u001b[90mmissing\u001b[39m │ 99      │ Zoe   │ \u001b[90mmissing\u001b[39m │ 99  │\n",
       "│ 8   │ \u001b[90mmissing\u001b[39m │ 99      │ Zoe   │ 999     │ 100 │"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "join(x, y, on=[:id1], makeunique=true) # with duplicates all combinations are produced (here :inner join)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id1</th><th>id2</th><th>name</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>1</td><td>Alice</td></tr><tr><th>2</th><td>1</td><td>11</td><td>Bob</td></tr><tr><th>3</th><td>missing</td><td>missing</td><td>Zed</td></tr><tr><th>4</th><td>missing</td><td>99</td><td>Zoe</td></tr></tbody></table>"
      ],
      "text/plain": [
       "4×3 DataFrames.DataFrame\n",
       "│ Row │ id1     │ id2     │ name  │\n",
       "├─────┼─────────┼─────────┼───────┤\n",
       "│ 1   │ 1       │ 1       │ Alice │\n",
       "│ 2   │ 1       │ 11      │ Bob   │\n",
       "│ 3   │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ Zed   │\n",
       "│ 4   │ \u001b[90mmissing\u001b[39m │ 99      │ Zoe   │"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "join(x, y, on=[:id1], kind=:semi) # but not by :semi join (as it would duplicate rows)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 0.6.2",
   "language": "julia",
   "name": "julia-0.6"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "0.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
